Find out how many UP and DOWN genes are also present in the
consensus signature
upInDf <-dff[dff$gene %in% up_genes$gene,]
setdiff(up_genes$gene,dff$gene)
## [1] "ACKR2" "R3HDML"
upInDf <-upInDf[order(upInDf$logFC, decreasing = TRUE),]
upInDf_sg <-upInDf[upInDf$`pv-adj` <= 0.05,]
dwInDf <-dff[dff$gene %in% dw_genes$gene,]
setdiff(dw_genes$gene,dff$gene)
## character(0)
dwInDf <-dwInDf[order(dwInDf$logFC, decreasing = TRUE),]
dwInDf_sg <-dwInDf[dwInDf$`pv-adj` <= 0.05,]
Venn Plot of Consensus genes and DE Validation genes
up_v <-dff[dff$logFC >=1.5 & dff$`pv-adj` <= 0.05,]
up_v <-up_v[complete.cases(up_v),] #315
dw_v <-dff[dff$logFC <= -1.5 & dff$`pv-adj` <= 0.05,]
dw_v <-dw_v[complete.cases(dw_v),] #320
DE_Validation <- c(up_v$gene,dw_v$gene)
consensus_sig <- c(up_genes$gene,dw_genes$gene)
length(intersect(dw_v$gene,dw_genes$gene))
## [1] 3
length(intersect(up_v$gene,up_genes$gene))
## [1] 28
dff2 <-dff[dff$gene %in% DE_Validation,]
dff2 <-dff2[complete.cases(dff2$gene),]
v <-list(DE_Validation=DE_Validation, consensus_sig=consensus_sig)
names(v) <-c("DE genes in Validation Dataset","Consensus signature")
ggvenn(v, show_elements = F,show_percentage = T, text_size =17, set_name_size = 10,
stroke_size = 0.5, fill_color = brewer.pal(name="Set1",n=4))+
ggtitle("Consensus genes differentially expressed in Validation dataset",
subtitle = "Validation DE: log2FC threshold = |1.5|, pv_adj <= 0.05")+
theme(plot.title = element_text(hjust = 0.5,vjust = 1, size = 25),
plot.subtitle = element_text(hjust = 0.5 ,vjust = 1, size = 15))

Fisher test to assess significance for the intersection
load(paste0(MEDIAsave,"combined_full_list.RData"))
conting_table <-data.frame("sign_in_Val"=rep(NA,2),"NOTsign_in_Val"=rep(NA,2))
rownames(conting_table) <-c("sign_in_Cons","NOTsign_in_Cons")
conting_table["sign_in_Cons","sign_in_Val"] <-length(intersect(DE_Validation,consensus_sig)) #31
conting_table["sign_in_Cons","NOTsign_in_Val"] <-length(setdiff(consensus_sig,DE_Validation)) #13
comb_notSig <-combined[!combined$gene %in% consensus_sig,] #13120=13164-44
Val_notSig <-dff[!dff$gene %in% dff2$gene,] #11811=12446-635
Val_notCons <-dff2[!dff2$gene %in% consensus_sig,]
conting_table["NOTsign_in_Cons","sign_in_Val"] <-nrow(Val_notCons) #604
conting_table["NOTsign_in_Cons","NOTsign_in_Val"]<-length(intersect(comb_notSig$gene,Val_notSig$gene)) #10241
conting_table
## sign_in_Val NOTsign_in_Val
## sign_in_Cons 31 13
## NOTsign_in_Cons 604 10241
fisher.test(conting_table)
##
## Fisher's Exact Test for Count Data
##
## data: conting_table
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 20.40468 84.37084
## sample estimates:
## odds ratio
## 40.38563
rm(comb_notSig,Val_notCons,Val_notSig)
write_xlsx(dff2, path=paste0(MEDIAsave,"DEValidationSelected.xlsx"))
png(file = paste0(MEDIAgraph,"Fig5a_venn.png"), width = 14, height = 8, res=300, units = "in")
ggvenn(v, show_elements = F,show_percentage = T, text_size =14, set_name_size = 9,
stroke_size = 0.5, fill_color = brewer.pal(name="Set1",n=4))+
ggtitle("Consensus genes differentially expressed in Validation dataset",
subtitle = "Validation DE: log2FC threshold = |1.5|, pv_adj <= 0.05")+
theme(plot.title = element_text(hjust = 0.5,vjust = 1, size = 25),
plot.subtitle = element_text(hjust = 0.5 ,vjust = 1, size = 15,margin = margin(0,0,15,0)))
dev.off()
rm(dff2)
norm4dataset <- as.data.frame(read_excel("~/MEDIA Project/GSE114007_raw_counts_4dataset.xlsx", sheet=1)) #Normal samples data
rownames(norm4dataset) <-norm4dataset$symbol
norm4dataset <-norm4dataset[,-1]
dim(norm4dataset)
## [1] 23710 18
oa4dataset <- as.data.frame(read_excel("~/MEDIA Project/GSE114007_raw_counts_4dataset.xlsx", sheet=2)) #OA samples data
rownames(oa4dataset) <-oa4dataset$symbol
oa4dataset <-oa4dataset[,-1]
dim(oa4dataset)
## [1] 23710 20
norm4dataset <-norm4dataset[rownames(oa4dataset),]
newdataset <-cbind(norm4dataset,oa4dataset)
coldata <-data.frame(class=c(rep("N",18),rep("OA",20)))
coldata$class <-as.factor(coldata$class)
coldata$info <-colnames(newdataset)
coldata$condition <-"cart"
coldata[9:18,"condition"] <-"all"
coldata[19:28,"condition"] <-"cart"
coldata[29:38,"condition"] <-"all"
coldata$condition <-as.factor(coldata$condition)
rownames(coldata) <-coldata$info
newdataset <- newdataset[, rownames(coldata)]
save(newdataset, file=paste0(MEDIAsave,"datasetValidation_full.RData"))
dds <- DESeqDataSetFromMatrix(countData = newdataset,
colData = coldata,
design = ~condition + class)
keep <- rowSums(assay(dds)>=5) >= 19 #at least 50% of the patients
dds <- dds[keep,]
dds$class <- factor(dds$class, levels = c("N","OA"))
dds$class <- relevel(dds$class, ref = "N")
dds$condition <- factor(dds$condition, levels = c("all","cart"))
nrow(dds)
## [1] 14643
Batch correction
mm <- model.matrix(~class, colData(vsd))
assay(vsd)<-removeBatchEffect(assay(vsd), batch = vsd$condition,design=mm)
pcaData <- plotPCA(vsd, intgroup=c("class", "condition"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=class, shape=condition,label = rownames(pcaData))) +
geom_point(size=3,alpha=1) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()+ggtitle("All samples without batch")+ theme_bw()

HEATMAP of consensus signature on the Validation dataset (z-score),
batch corrected
normdata <-assay(vsd)
normdata1 <-t(apply(normdata,1,scale))
colnames(normdata1) <-colnames(normdata)
normdata <-normdata1
signature <-c(up_genes$gene,dw_genes$gene)
setdiff(signature,rownames(normdata)) #""ACKR2" "TMEM59L" "R3HDML" UP gene missing
## [1] "ACKR2" "TMEM59L" "R3HDML"
up2 <-up_genes$gene[up_genes$gene %in% rownames(normdata)]
normdata <-normdata[c(up2,dw_genes$gene),]
dim(normdata)
## [1] 41 38
coldataAnn <-coldata
coldataAnn$col2 <-"plum"
coldataAnn[coldataAnn$class=="OA","col2"] <-"cyan"
rowdata <-data.frame(genes=c(up2,dw_genes$gene))
rowdata$col3 <-c(rep("red",36),rep("blue",5))
heatmap3(normdata[rowdata$genes,rownames(coldataAnn)],
scale="none",
col=colorRampPalette(c("royalblue", "white", "red1"))(1024),
RowAxisColors = 1,
balanceColor = TRUE,
margins = c(16,10),
ColSideColors = coldataAnn$col2,
ColSideLabs = "Class",
RowSideColors = rowdata$col3,
RowSideLabs = "DE genes integration",
cexRow = 0.9, cexCol = 0.9)
legend(0.8,1,legend=c("OA","Normal"),
fill=c("cyan","plum"), bty = "n", title = "Class")
legend(0.9,1,legend=c("UP","DOWN"),
fill=c("red","blue"), bty = "n", title = "DE")

png(file = paste0(MEDIAgraph,"Fig5b_ValHeatmap.png"), width = 15, height = 12.4, res=300, units = "in")
heatmap3(normdata[rowdata$genes,rownames(coldataAnn)],
scale="none",
col=colorRampPalette(c("royalblue", "white", "red1"))(1024),
RowAxisColors = 1,
balanceColor = TRUE,
margins = c(16,10),
ColSideColors = coldataAnn$col2,
ColSideLabs = "Class",
RowSideColors = rowdata$col3,
RowSideLabs = "DE genes integration",
cexRow = .8, cexCol = .9)
legend(0.8,1,legend=c("OA","Normal"),
fill=c("cyan","plum"), bty = "n", title = "Class")
legend(0.9,1,legend=c("UP","DOWN"),
fill=c("red","blue"), bty = "n", title = "DE")
dev.off()
Enrichment analysis of consensus signature on Validation Dataset:
GSEA Running Plot
sig <-data.frame(gs_name="Consensus signature",gene_symbol=c(up_genes$gene,dw_genes$gene))
colnames(sig) <-c("gs_name","gene_symbol")
colnames(sig) <-c("term_id","gene_id")
gene <-dff$logFC
names(gene) <-dff$gene
gene <-sort(gene,decreasing = TRUE)
set.seed(2459)
gs <- clusterProfiler::GSEA(gene,TERM2GENE = sig,
minGSSize = 5,
eps = 0,
pAdjustMethod = "fdr",
pvalueCutoff =0.01)
gseaplot2(gs, title ="Running score GSEA plot - validation dataset", geneSetID = 1, pvalue_table = T)

Session Info
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 grid stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] writexl_1.4.2 enrichplot_1.21.0
## [3] clusterProfiler_4.4.4 msigdbr_7.5.1
## [5] heatmap3_1.1.9 limma_3.52.4
## [7] DESeq2_1.36.0 SummarizedExperiment_1.26.1
## [9] Biobase_2.56.0 MatrixGenerics_1.8.1
## [11] matrixStats_1.0.0 GenomicRanges_1.48.0
## [13] GenomeInfoDb_1.32.4 IRanges_2.30.1
## [15] S4Vectors_0.34.0 BiocGenerics_0.42.0
## [17] RColorBrewer_1.1-3 ggvenn_0.1.10
## [19] lubridate_1.9.2 forcats_1.0.0
## [21] stringr_1.5.0 dplyr_1.1.3
## [23] purrr_1.0.1 readr_2.1.4
## [25] tidyr_1.3.0 tibble_3.2.1
## [27] ggplot2_3.4.3 tidyverse_2.0.0
## [29] readxl_1.4.3
##
## loaded via a namespace (and not attached):
## [1] shadowtext_0.1.2 fastmatch_1.1-4 plyr_1.8.8
## [4] igraph_1.5.0 lazyeval_0.2.2 splines_4.2.1
## [7] BiocParallel_1.30.4 digest_0.6.32 yulab.utils_0.0.6
## [10] htmltools_0.5.6 GOSemSim_2.22.0 viridis_0.6.3
## [13] GO.db_3.15.0 fansi_1.0.4 magrittr_2.0.3
## [16] memoise_2.0.1 tzdb_0.4.0 fastcluster_1.2.3
## [19] Biostrings_2.64.1 annotate_1.74.0 graphlayouts_1.0.0
## [22] timechange_0.2.0 colorspace_2.1-0 blob_1.2.4
## [25] ggrepel_0.9.3 xfun_0.39 crayon_1.5.2
## [28] RCurl_1.98-1.12 jsonlite_1.8.7 scatterpie_0.2.1
## [31] genefilter_1.78.0 survival_3.5-5 ape_5.7-1
## [34] glue_1.6.2 polyclip_1.10-4 gtable_0.3.4
## [37] zlibbioc_1.42.0 XVector_0.36.0 DelayedArray_0.22.0
## [40] scales_1.2.1 DOSE_3.22.1 DBI_1.1.3
## [43] Rcpp_1.0.11 viridisLite_0.4.2 xtable_1.8-4
## [46] gridGraphics_0.5-1 tidytree_0.4.2 bit_4.0.5
## [49] httr_1.4.7 fgsea_1.22.0 pkgconfig_2.0.3
## [52] XML_3.99-0.14 farver_2.1.1 sass_0.4.7
## [55] locfit_1.5-9.8 utf8_1.2.3 ggplotify_0.1.0
## [58] tidyselect_1.2.0 labeling_0.4.3 rlang_1.1.1
## [61] reshape2_1.4.4 AnnotationDbi_1.58.0 munsell_0.5.0
## [64] cellranger_1.1.0 tools_4.2.1 cachem_1.0.8
## [67] downloader_0.4 cli_3.6.1 generics_0.1.3
## [70] RSQLite_2.3.1 evaluate_0.21 fastmap_1.1.1
## [73] yaml_2.3.7 ggtree_3.6.2 babelgene_22.9
## [76] knitr_1.43 bit64_4.0.5 tidygraph_1.2.3
## [79] KEGGREST_1.36.3 ggraph_2.1.0 nlme_3.1-162
## [82] aplot_0.1.10 DO.db_2.9 compiler_4.2.1
## [85] rstudioapi_0.15.0 png_0.1-8 treeio_1.20.2
## [88] tweenr_2.0.2 geneplotter_1.74.0 bslib_0.5.1
## [91] stringi_1.7.12 highr_0.10 lattice_0.21-8
## [94] Matrix_1.5-4.1 vctrs_0.6.3 pillar_1.9.0
## [97] lifecycle_1.0.3 jquerylib_0.1.4 data.table_1.14.8
## [100] bitops_1.0-7 patchwork_1.1.3 qvalue_2.28.0
## [103] R6_2.5.1 gridExtra_2.3 codetools_0.2-19
## [106] MASS_7.3-58.3 withr_2.5.0 GenomeInfoDbData_1.2.8
## [109] parallel_4.2.1 hms_1.1.3 ggfun_0.1.0
## [112] rmarkdown_2.24 ggforce_0.4.1